[FRAUD] Data cleanup attempt (8.25_df02, kernel dies)

Author

김보람

Published

August 25, 2023

import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # for train_test_split
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  #엣지의 출발점
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  #엣지의 도착점
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  #엣지 속성 설정,각 속성의 사기 여부부     
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # 엣지 속성 설정, 각 엣지의 거래 금액

    return G
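# Illustrative sketch (not part of the original run): a hypothetical toy frame showing
# what build_graph_bipartite produces, i.e. one node per card/merchant and one edge per
# (card, merchant) pair carrying the summed amount and a binary fraud label.
toy = pd.DataFrame({"cc_num":   [111, 111, 222],
                    "merchant": ["m_a", "m_b", "m_a"],
                    "amt":      [10.0, 25.0, 99.0],
                    "is_fraud": [0, 0, 1]})
G_toy = build_graph_bipartite(toy)
print(G_toy.number_of_nodes(), G_toy.number_of_edges())  # 4 nodes (2 cards + 2 merchants), 3 edges
print(nx.get_edge_attributes(G_toy, "label"))            # {(card, merchant): 0 or 1}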


def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")   
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")  
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
    
    
def down_sample_textbook(df):
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_dowsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_dowsampled])
    return df_downsampled

def embedding(Graph):
    # Graph -> X (feature)
    _edgs = list(Graph.edges)
    subGraph = Graph.edge_subgraph([_edgs[x] for x in range(len(Graph.edges))]).copy()
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))    
    embedded = AverageEmbedder(Node2Vec(subGraph, weight_key='weight').fit(window=10).wv)
    X = [embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in range(len(Graph.edges))]
    # Graph -> y (label)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X,y 

def anal(df):
    Graph = build_graph_bipartite(df)
    X, y = embedding(Graph)
    X, XX, y, yy = model_selection.train_test_split(X, y, random_state=42)
    lrnr = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({
        'acc':[sklearn.metrics.accuracy_score(yy,yyhat)], 
        'pre':[sklearn.metrics.precision_score(yy,yyhat)], 
        'rec':[sklearn.metrics.recall_score(yy,yyhat)],
        'f1':[sklearn.metrics.f1_score(yy,yyhat)]}
    )    
    return df

def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @cus_list")
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain
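For reference, the helper functions above could be chained end to end like the sketch below (not run here; the Node2Vec step is slow on data of this size):

df_small = down_sample_textbook(fraudTrain)   # balanced sample: all fraud rows plus an equal number of normal rows
anal(df_small)                                # bipartite graph -> edge embedding -> random forest metrics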

Attempt

_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape
214520*214520  # number of (i, j) pairs if every row were compared with every other row
df02.is_fraud.mean().round(5)
  • fraud rate of the sample..
df02 = df02.reset_index()
N = len(df02)

tr/test

df02_tr,df02_test = sklearn.model_selection.train_test_split(df02, random_state=42)
df02_tr.is_fraud.mean().round(5), df02_test.is_fraud.mean().round(5)
df02_tr.shape, df02_test.shape
train_mask =  [i in df02_tr.index for i in range(N)] 
test_mask =  [i in df02_test.index for i in range(N)] 
np.array(train_mask).sum(), np.array(test_mask).sum()
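The same masks can also be built without the Python loop, e.g. with np.isin (equivalent here because df02 was reset to a 0..N-1 RangeIndex above):

train_mask = np.isin(np.arange(N), df02_tr.index)
test_mask  = np.isin(np.arange(N), df02_test.index)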

Data still running.. (df02 kills the kernel right here)!!! _ try again tomorrow

import time 
t1= time.time()
edge_index_list_plus = []
_cc_num = np.array(df02['cc_num'])
_trans_date_trans_time=np.array(df02['trans_date_trans_time'].apply(lambda x: x.value))
# O(N^2) double loop: with N = 214,520 this tries to hold ~4.6e10 [i, j, dt] triplets
# in a Python list, which is what exhausts memory and kills the kernel.
for i in range(N):
    for j in range(N):
        if _cc_num[i] != _cc_num[j]:  # different cards: no time relation, mark with 0
            time_difference = 0
        else:
            time_difference = abs(_trans_date_trans_time[i] - _trans_date_trans_time[j])
        edge_index_list_plus.append([i, j, time_difference])
edge_index_list_plus_nparr =np.array(edge_index_list_plus)
np.save('edge_index_list_plus02.npy', edge_index_list_plus_nparr)
t2= time.time()
t2-t1
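One way to side-step the full N x N sweep (a sketch only, not what was attempted here): generate pairs only within each card. Pairs across different cards get weight 0 in the next step and are dropped by the "weight > mean" filter anyway, so skipping them up front changes nothing except that theta and the mean threshold are then computed without the zero rows.

from itertools import combinations

edge_rows = []
for _, grp in df02.groupby('cc_num'):
    idx = grp.index.to_numpy()
    times = grp['trans_date_trans_time'].apply(lambda x: x.value).to_numpy()
    for a, b in combinations(range(len(idx)), 2):   # still quadratic, but only within one card
        dt = abs(int(times[a]) - int(times[b]))
        edge_rows.append([idx[a], idx[b], dt])
        edge_rows.append([idx[b], idx[a], dt])      # keep both directions, as in the loop above
edge_index_within = np.array(edge_rows)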

Data still running…………………. trying again and again

edge_index = np.array(edge_index_list_plus).astype(np.float64)  # float dtype so the decayed weights below are not truncated to integers
edge_index.shape
edge_index
edge_index[:,2] = np.abs(edge_index[:,2])
theta = edge_index[:,2].mean()
theta
# decay each time gap into a weight in (0, 1); dt == 0 (a cross-card pair) would give exp(0) = 1, so those entries are zeroed out instead
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta)!=1) * np.exp(-edge_index[:,2]/theta)
edge_index
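# Small made-up example of what the weighting above does: dt == 0 (a cross-card pair)
# would give exp(0) = 1, so it is zeroed out; same-card pairs that are close in time
# keep a weight near 1, distant ones decay toward 0.
toy_dt = np.array([0.0, 1e9, 5e9])
toy_theta = toy_dt.mean()
(np.exp(-toy_dt/toy_theta) != 1) * np.exp(-toy_dt/toy_theta)   # roughly array([0., 0.61, 0.08])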
edge_index_list_updated = edge_index.tolist()
np.array(edge_index_list_updated)[:,2].mean()
mm = np.array(edge_index_list_updated)[:,2].mean()

What if the time gap is shorter than the average? Hmm..

selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape

data setup (x, edge_index, y)

x = df02['amt']
a = torch.tensor(x.values, dtype=torch.float)
a = a.reshape(-1,1)
a
y = df02['is_fraud']
b = torch.tensor(y.values, dtype=torch.int64)
b
import torch_geometric
train_mask = torch.tensor(train_mask, dtype=torch.bool)  # PyG expects boolean mask tensors, not Python lists
test_mask = torch.tensor(test_mask, dtype=torch.bool)
data = torch_geometric.data.Data(x=a, edge_index = edge_index_selected, y=b, train_mask = train_mask, test_mask = test_mask)
data
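# Quick sanity check on the assembled object (sizes only): the node count should be N,
# and the two masks should split the nodes roughly 3:1, matching the split above.
data.num_nodes, data.num_edges, int(data.train_mask.sum()), int(data.test_mask.sum())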


gnn



import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)   # 1-dim input feature (amt) -> 16 hidden channels
        self.conv2 = GCNConv(16, 2)   # 16 hidden channels -> 2 classes (fraud / normal)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
model = GCN()
model
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()

for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')
Accuracy: 0.9321
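Accuracy alone can be flattering on this kind of data, so it may also be worth checking precision/recall/F1 on the test mask (a sketch using the sklearn.metrics imported above; not part of the original run):

yy = data.y[data.test_mask].numpy()
yyhat = pred[data.test_mask].numpy()
sklearn.metrics.precision_score(yy, yyhat), sklearn.metrics.recall_score(yy, yyhat), sklearn.metrics.f1_score(yy, yyhat)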